In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Merge 12 months of sales data into a single csv file¶
In [2]:
files =[file for file in os.listdir("D:\projects\sales\Sales_Data")]
for file in files:
    print(file)
all_data.csv
Sales_April_2019.csv
Sales_August_2019.csv
Sales_December_2019.csv
Sales_February_2019.csv
Sales_January_2019.csv
Sales_July_2019.csv
Sales_June_2019.csv
Sales_March_2019.csv
Sales_May_2019.csv
Sales_November_2019.csv
Sales_October_2019.csv
Sales_September_2019.csv
In [4]:
path = "D:\projects\sales\Sales_Data"

#blank dataframe
all_data = pd.DataFrame()

for file in files:
    current_df = pd.read_csv(path+"/"+file)
    all_data = pd.concat([all_data, current_df])
    
all_data.shape
Out[4]:
(373700, 6)
convert it into dataset¶
In [6]:
all_data.to_csv('D:\projects\sales\Sales_Data/all_data.csv',index=False)
Data cleaning and formatting¶
In [7]:
all_data.dtypes
Out[7]:
Order ID            object
Product             object
Quantity Ordered    object
Price Each          object
Order Date          object
Purchase Address    object
dtype: object
In [8]:
all_data.head()
Out[8]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001
1 NaN NaN NaN NaN NaN NaN
2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215
3 176560 Google Phone 1 600 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001
4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001
In [26]:
all_data.isnull().sum()
Out[26]:
Order ID            0
Product             0
Quantity Ordered    0
Price Each          0
Order Date          0
Purchase Address    0
Month               0
dtype: int64
In [27]:
all_data = all_data.dropna(how='all')
all_data.shape
Out[27]:
(371900, 7)
What is the best month for sale?¶
In [28]:
'04/19/19 08:46'.split('/')[0]
Out[28]:
'04'
In [29]:
def month(x):
    return x.split('/')[0]
add month col¶
In [30]:
all_data['Month']=all_data['Order Date'].apply(month)
In [31]:
all_data.dtypes
Out[31]:
Order ID            object
Product             object
Quantity Ordered    object
Price Each          object
Order Date          object
Purchase Address    object
Month               object
dtype: object
In [32]:
all_data['Month']=all_data['Month'].astype(int)
In [33]:
all_data['Month'].unique()
Out[33]:
array([ 4,  5,  8,  9, 12,  1,  2,  3,  7,  6, 11, 10])
In [34]:
filter=all_data['Month']=='Order Date'
len(all_data[~filter])
Out[34]:
371900
In [35]:
all_data=all_data[~filter]
In [36]:
all_data.shape
Out[36]:
(371900, 7)
In [37]:
all_data.head()
Out[37]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4
2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4
3 176560 Google Phone 1 600 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4
4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4
5 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4
In [70]:
all_data['Month']=all_data['Month'].astype(int)
In [39]:
all_data.dtypes
Out[39]:
Order ID            object
Product             object
Quantity Ordered    object
Price Each          object
Order Date          object
Purchase Address    object
Month                int32
dtype: object
In [40]:
all_data['Price Each']=all_data['Price Each'].astype(float)
In [41]:
all_data['Quantity Ordered']=all_data['Quantity Ordered'].astype(int)
In [42]:
all_data['sales']=all_data['Quantity Ordered']*all_data['Price Each']
all_data.head(5)
Out[42]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month sales
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4 23.90
2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4 99.99
3 176560 Google Phone 1 600.00 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 600.00
4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 11.99
5 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4 11.99
In [43]:
all_data.groupby('Month')['sales'].sum()
Out[43]:
Month
1     3644513.46
2     4404044.84
3     5614200.76
4     6781340.48
5     6305213.50
6     5155604.52
7     5295551.52
8     4488935.76
9     4195120.26
10    7473453.76
11    6399206.40
12    9226886.68
Name: sales, dtype: float64
In [44]:
months=range(1,13)
plt.bar(months,all_data.groupby('Month')['sales'].sum())
plt.xticks(months)
plt.ylabel('Sales in USD ($)')
plt.xlabel('Month number')
plt.show()
Which city has max order¶
In [45]:
'917 1st St, Dallas, TX 75001'.split(',')[1]
Out[45]:
' Dallas'
In [46]:
def city(x):
    return x.split(',')[1]
In [47]:
all_data['city']=all_data['Purchase Address'].apply(city)
In [48]:
all_data.groupby('city')['city'].count()
Out[48]:
city
 Atlanta          29762
 Austin           19810
 Boston           39868
 Dallas           29640
 Los Angeles      59210
 New York City    49752
 Portland         24930
 San Francisco    89464
 Seattle          29464
Name: city, dtype: int64
In [49]:
plt.bar(all_data.groupby('city')['city'].count().index,all_data.groupby('city')['city'].count())
plt.xticks(rotation='vertical')
plt.ylabel('received orders')
plt.xlabel('city names')
plt.show()
In [ ]:
 
What time should we display advertisements to maximise for product purchase?¶
In [ ]:
 
In [50]:
all_data['Order Date'][0].dtype
Out[50]:
dtype('O')
In [69]:
all_data['Hour'] = pd.to_datetime(all_data['Order Date']).dt.hour
In [52]:
keys=[]
hour=[]
for key,hour_df in all_data.groupby('Hour'):
    keys.append(key)
    hour.append(len(hour_df))
In [53]:
plt.grid()
plt.plot(keys,hour)
Out[53]:
[<matplotlib.lines.Line2D at 0x2738f181400>]
between 12pm and 7pm is probably the best time to advertise to maximise product purchase¶
In [ ]:
 

What product sold the most? & Why?¶

In [54]:
all_data.groupby('Product')['Quantity Ordered'].sum().plot(kind='bar')
Out[54]:
<AxesSubplot:xlabel='Product'>
In [55]:
all_data.groupby('Product')['Price Each'].mean()
Out[55]:
Product
20in Monitor                   109.99
27in 4K Gaming Monitor         389.99
27in FHD Monitor               149.99
34in Ultrawide Monitor         379.99
AA Batteries (4-pack)            3.84
AAA Batteries (4-pack)           2.99
Apple Airpods Headphones       150.00
Bose SoundSport Headphones      99.99
Flatscreen TV                  300.00
Google Phone                   600.00
LG Dryer                       600.00
LG Washing Machine             600.00
Lightning Charging Cable        14.95
Macbook Pro Laptop            1700.00
ThinkPad Laptop                999.99
USB-C Charging Cable            11.95
Vareebadd Phone                400.00
Wired Headphones                11.99
iPhone                         700.00
Name: Price Each, dtype: float64
In [56]:
products=all_data.groupby('Product')['Quantity Ordered'].sum().index
quantity=all_data.groupby('Product')['Quantity Ordered'].sum()
prices=all_data.groupby('Product')['Price Each'].mean()
In [57]:
plt.figure(figsize=(40,24))
fig,ax1 = plt.subplots()
ax2=ax1.twinx()
ax1.bar(products, quantity, color='g')
ax2.plot(products, prices, 'b-')
ax1.set_xticklabels(products, rotation='vertical', size=8)
C:\Users\anujd\AppData\Local\Temp\ipykernel_21184\2263540929.py:6: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax1.set_xticklabels(products, rotation='vertical', size=8)
Out[57]:
[Text(0, 0, '20in Monitor'),
 Text(1, 0, '27in 4K Gaming Monitor'),
 Text(2, 0, '27in FHD Monitor'),
 Text(3, 0, '34in Ultrawide Monitor'),
 Text(4, 0, 'AA Batteries (4-pack)'),
 Text(5, 0, 'AAA Batteries (4-pack)'),
 Text(6, 0, 'Apple Airpods Headphones'),
 Text(7, 0, 'Bose SoundSport Headphones'),
 Text(8, 0, 'Flatscreen TV'),
 Text(9, 0, 'Google Phone'),
 Text(10, 0, 'LG Dryer'),
 Text(11, 0, 'LG Washing Machine'),
 Text(12, 0, 'Lightning Charging Cable'),
 Text(13, 0, 'Macbook Pro Laptop'),
 Text(14, 0, 'ThinkPad Laptop'),
 Text(15, 0, 'USB-C Charging Cable'),
 Text(16, 0, 'Vareebadd Phone'),
 Text(17, 0, 'Wired Headphones'),
 Text(18, 0, 'iPhone')]
<Figure size 4000x2400 with 0 Axes>
The top selling product is 'AAA Batteries'. The top selling products seem to have a correlation with the price of the product. The cheaper the product higher the quantity ordered and vice versa.¶
In [58]:
all_data.shape
Out[58]:
(371900, 10)
What products are most often sold together?¶

note: keep orders that have same order Id,are sold mostly together¶

In [59]:
df=all_data[all_data['Order ID'].duplicated(keep=False)]
df.head(20)
Out[59]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month sales city Hour
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4 23.90 Dallas 8
2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4 99.99 Boston 22
3 176560 Google Phone 1 600.00 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 600.00 Los Angeles 14
4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 11.99 Los Angeles 14
5 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4 11.99 Los Angeles 9
6 176562 USB-C Charging Cable 1 11.95 04/29/19 13:03 381 Wilson St, San Francisco, CA 94016 4 11.95 San Francisco 13
7 176563 Bose SoundSport Headphones 1 99.99 04/02/19 07:46 668 Center St, Seattle, WA 98101 4 99.99 Seattle 7
8 176564 USB-C Charging Cable 1 11.95 04/12/19 10:58 790 Ridge St, Atlanta, GA 30301 4 11.95 Atlanta 10
9 176565 Macbook Pro Laptop 1 1700.00 04/24/19 10:38 915 Willow St, San Francisco, CA 94016 4 1700.00 San Francisco 10
10 176566 Wired Headphones 1 11.99 04/08/19 14:05 83 7th St, Boston, MA 02215 4 11.99 Boston 14
11 176567 Google Phone 1 600.00 04/18/19 17:18 444 7th St, Los Angeles, CA 90001 4 600.00 Los Angeles 17
12 176568 Lightning Charging Cable 1 14.95 04/15/19 12:18 438 Elm St, Seattle, WA 98101 4 14.95 Seattle 12
13 176569 27in 4K Gaming Monitor 1 389.99 04/16/19 19:23 657 Hill St, Dallas, TX 75001 4 389.99 Dallas 19
14 176570 AA Batteries (4-pack) 1 3.84 04/22/19 15:09 186 12th St, Dallas, TX 75001 4 3.84 Dallas 15
15 176571 Lightning Charging Cable 1 14.95 04/19/19 14:29 253 Johnson St, Atlanta, GA 30301 4 14.95 Atlanta 14
16 176572 Apple Airpods Headphones 1 150.00 04/04/19 20:30 149 Dogwood St, New York City, NY 10001 4 150.00 New York City 20
17 176573 USB-C Charging Cable 1 11.95 04/27/19 18:41 214 Chestnut St, San Francisco, CA 94016 4 11.95 San Francisco 18
18 176574 Google Phone 1 600.00 04/03/19 19:42 20 Hill St, Los Angeles, CA 90001 4 600.00 Los Angeles 19
19 176574 USB-C Charging Cable 1 11.95 04/03/19 19:42 20 Hill St, Los Angeles, CA 90001 4 11.95 Los Angeles 19
20 176575 AAA Batteries (4-pack) 1 2.99 04/27/19 00:30 433 Hill St, New York City, NY 10001 4 2.99 New York City 0
In [60]:
#create grouped col 
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
In [61]:
df.head()
Out[61]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month sales city Hour Grouped
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4 23.90 Dallas 8 USB-C Charging Cable,USB-C Charging Cable
2 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4 99.99 Boston 22 Bose SoundSport Headphones,Bose SoundSport Hea...
3 176560 Google Phone 1 600.00 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 600.00 Los Angeles 14 Google Phone,Wired Headphones,Google Phone,Wir...
4 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 11.99 Los Angeles 14 Google Phone,Wired Headphones,Google Phone,Wir...
5 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4 11.99 Los Angeles 9 Wired Headphones,Wired Headphones
In [62]:
df.shape
Out[62]:
(371900, 11)
In [63]:
#lets drop out all duplicate Order ID
df2 = df.drop_duplicates(subset=['Order ID'])
In [64]:
df2['Grouped'].value_counts()[0:5].plot.pie()
Out[64]:
<AxesSubplot:ylabel='Grouped'>
In [65]:
import plotly.graph_objs as go
from plotly.offline import iplot
In [66]:
values=df2['Grouped'].value_counts()[0:5]
labels=df['Grouped'].value_counts()[0:5].index
In [67]:
trace=go.Pie(labels=labels, values=values,
               hoverinfo='label+percent', textinfo='value', 
               textfont=dict(size=25),
              pull=[0, 0, 0,0.2, 0]
               )
In [68]:
iplot([trace])
In [ ]:
 
In [ ]:
 
In [ ]: